CISC 520 Final Project
Wusi Fan
import ast
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Import recipe data file as a DataFrame
recipes_df = pd.read_csv("RAW_recipes.csv", encoding="utf-8")
## remove rows containing NaN values
recipes_df_rm_na = recipes_df.dropna()
## parse the 'submitted' column into datetime format
recipes_df_rm_na.loc[:, 'submitted'] = pd.to_datetime(recipes_df_rm_na['submitted'])
## keep only recipes submitted before 2018, so every recipe has been uploaded for about a year
## NOTE(review): the original note said "before Dec 8, 2017", but this filter keeps all of 2017 -- confirm intended
submitted_before_2018 = recipes_df_rm_na['submitted'].dt.year < 2018
recipes_df_rm_na = recipes_df_rm_na[submitted_before_2018]
recipes_df_rm_na.info()
recipes_df_rm_na.head()
## define functions used to do Frequent Pattern Mining
def list2list(ls):
    """Parse a stringified Python list (e.g. "['salt', 'olive oil']") into a list of items.

    Uses ast.literal_eval so the surrounding quotes are removed from each item
    and items that themselves contain ", " are not split apart (the previous
    bracket-strip / comma-split kept the quotes and mis-split such items).
    Falls back to the simple split for strings that are not valid list literals.
    """
    try:
        parsed = ast.literal_eval(ls)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, list):
        return [str(item) for item in parsed]
    # fallback: original behavior for non-literal strings
    return ls.strip("][").split(', ')
def name2list(ls):
    """Split a recipe-name string into its space-separated words.

    Any leading/trailing square brackets are stripped first, matching the
    stringified-list format used by the other columns in this file.
    """
    return ls.strip("][").split(' ')
### using the mlxtend package for Frequent Pattern Mining
### reference: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/
def fpm(df, column_name, min_supt, split_func):
    """Run frequent-pattern mining (Apriori) on one column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame holding the column to mine.
    column_name : str, name of the column whose cells are stringified item lists.
    min_supt : float, minimum support threshold passed to apriori.
    split_func : callable turning one cell (a string) into a list of items.

    Prints the frequent itemsets sorted by support (descending) and, unlike
    the original, also returns them so callers can reuse the result.
    """
    ## convert data into transaction format (list of lists)
    transactions = df.loc[:, column_name].apply(split_func).tolist()
    ## create a True/False table indicating whether each item appears in each row
    from mlxtend.preprocessing import TransactionEncoder  # third-party; imported lazily
    encoder = TransactionEncoder()
    onehot = encoder.fit(transactions).transform(transactions)
    onehot_df = pd.DataFrame(onehot, columns=encoder.columns_)
    ## keep the items and itemsets with at least min_supt support
    from mlxtend.frequent_patterns import apriori
    result = apriori(onehot_df, min_support=min_supt, use_colnames=True).sort_values("support", ascending=False)
    print(result)
    return result
## frequent patterns of ingredients
fpm(recipes_df_rm_na,"ingredients",0.1,list2list)
## frequent patterns of tags
fpm(recipes_df_rm_na,"tags",0.6,list2list)
## frequent patterns of words in recipe names
fpm(recipes_df_rm_na,"name",0.03,name2list)
Recipe Nutrition - Calorie Level
The nutrition field in the raw data is unusable due to a lack of documentation; however, the author of the dataset uploaded another data file that includes a calorie level, which can be mapped onto the raw data file as a proxy for nutrition level.
## read the calorie-level data, indexed by recipe id so it can be used with Series.map
caloiro_df = pd.read_csv("PP_recipes.csv", index_col = "id", encoding="utf-8")
## map the calorie level onto the recipe table (lookup by recipe id; unmatched ids become NaN)
recipes_df_rm_na.loc[:,"caloiro_level"]=recipes_df_rm_na.id.map(caloiro_df.loc[:,"calorie_level"])
recipes_df_rm_na.head()
The time when a recipe is uploaded could potentially impact how many reviews it receives. For example, a recipe uploaded during weekends or holiday seasons might receive more views, as more people are looking for cooking suggestions during those periods.
## derive calendar features from the submission timestamp via the .dt accessor
submit_dt = recipes_df_rm_na.loc[:, "submitted"].dt
## submit year
recipes_df_rm_na.loc[:, "submit_year"] = submit_dt.year
## submit month
recipes_df_rm_na.loc[:, "submit_month"] = submit_dt.month
## submit day of month
recipes_df_rm_na.loc[:, "day_of_month"] = submit_dt.day
## submit day of week (Monday=0 .. Sunday=6)
recipes_df_rm_na.loc[:, 'day_of_week'] = submit_dt.dayofweek
recipes_df_rm_na.head()
recipes_df_rm_na.describe()
## Check out how many recipes are submitted by year
recipes_df_rm_na.groupby("submit_year")["id"].count()
The complexity of an instruction may impact people's willingness to read and follow it, and therefore change the likelihood of a recipe receiving reviews.
LIX of Instruction
LIX is a measure of readability. On a scale from 0 to 100, the higher the score, the less readable the text is. It can be computed based on the formula below:
Reference: Christoph Trattner et al., 2018. https://epjdatascience.springeropen.com/articles/10.1140/epjds/s13688-018-0149-5#Sec10
## length of instruction
## NOTE(review): 'steps' holds a stringified list, so len() counts characters of that string, not the number of steps -- confirm intended
recipes_df_rm_na.loc[:,"instruction_length"] = recipes_df_rm_na.loc[:,'steps'].apply(len)
## LIX of instruction
### function to calculate LIX readability:
def lix(text):
    """Compute the LIX readability score of *text*.

    LIX = (words / sentences) + 100 * (long words / words), where a "long"
    word has more than 6 characters. Higher scores mean harder-to-read text.
    Returns NaN for input with no words (avoids division by zero).

    NOTE(review): sentences are delimited by commas here, presumably because
    the 'steps' column is a comma-separated stringified list -- confirm.
    """
    words = text.split()
    n_words = len(words)
    n_long_words = sum(1 for w in words if len(w) > 6)
    n_sentences = len(text.split(","))
    try:
        # percentage of long words + average sentence length
        return 100 * n_long_words / n_words + n_words / n_sentences
    except ZeroDivisionError:
        # empty text: no words to score
        return np.nan
### calculate LIX of each instruction (applied to the stringified 'steps' column)
recipes_df_rm_na.loc[:,"instruction_lix"] = recipes_df_rm_na.loc[:,'steps'].apply(lix)
recipes_df_rm_na.head()
Similar to the instruction, the complexity of the recipe name and description may influence how likely a recipe is to receive views.
## length of title (character count of the recipe name)
recipes_df_rm_na.loc[:,"title_length"] = recipes_df_rm_na.loc[:,'name'].apply(len)
## length of description (character count)
recipes_df_rm_na.loc[:,"description_length"] = recipes_df_rm_na.loc[:,'description'].apply(len)
## LIX readability score of description
recipes_df_rm_na.loc[:,"description_lix"] = recipes_df_rm_na.loc[:,'description'].apply(lix)
## Sentiment of description (VADER compound score)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
### define a function to calculate the sentiment score
def senti(text):
    """Return the VADER compound sentiment score of *text*, in [-1, 1].

    The SentimentIntensityAnalyzer is constructed once and cached on the
    function object: building it (loading the lexicon) on every call is
    wasteful when this function is applied to every DataFrame row.
    """
    analyzer = getattr(senti, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        senti._analyzer = analyzer
    return analyzer.polarity_scores(text)["compound"]
### calculate the sentiment score of each description
recipes_df_rm_na.loc[:,"description_senti"] = recipes_df_rm_na.loc[:,'description'].apply(senti)
recipes_df_rm_na.head()
## drop rows with NaN values introduced by feature engineering (e.g. recipes without a mapped calorie level, or NaN LIX)
recipes_df_rm_na = recipes_df_rm_na.dropna()
recipes_df_rm_na.info()
## Save feature-engineering result to a csv file as a backup
recipes_df_rm_na.to_csv("feature_engineering_result.csv",index=True)
In this analysis, considering the data available, the number of reviews received by a recipe is used as its popularity index.
Since a recipe can keep receiving reviews long after its initial upload, three time periods (a week, a month, and a year) are used here to capture the reviews received within each window.
For regression:
For classification:
# Step 1. Attach each recipe's submission date to its reviews and flag review windows.
## Import review data file as a DataFrame
review_df = pd.read_csv("RAW_interactions.csv", encoding="utf-8")
review_df.head()
## convert review date column to datetime format
review_df.loc[:,"date"]= pd.to_datetime(review_df.loc[:,"date"])
## create a recipe_submit_date table for mapping
recipe_submit_date = recipes_df_rm_na.loc[:,["id","submitted"]]
## map submit date to review table
## NOTE(review): inner merge -- reviews of recipes filtered out earlier are dropped here
review_df = review_df.merge(recipe_submit_date, left_on="recipe_id", right_on="id")
review_df.info()
## calculate days between recipe submission and review (a Timedelta column)
review_df.loc[:,"days_after_submit"] = review_df.loc[:,"date"] - review_df.loc[:,"submitted"]
## create 0/1 columns marking whether the review falls within each window after submission
review_df.loc[:,"within 7-day"] = (review_df.loc[:,"days_after_submit"] <=datetime.timedelta(7))*1
review_df.loc[:,"within 30-day"] = (review_df.loc[:,"days_after_submit"] <=datetime.timedelta(30))*1
review_df.loc[:,"within 365-day"] = (review_df.loc[:,"days_after_submit"] <=datetime.timedelta(365))*1
review_df.head()
# Step 2. Aggregate per-recipe review counts for each time window.
## calculate how many reviews fall in each time period using groupby
review_counts = review_df.groupby('recipe_id').agg({"within 7-day":"sum", "within 30-day":"sum","within 365-day":"sum"})
## create boolean (0/1) columns indicating whether a recipe received any review in each time period
review_boolean = (review_counts>0)*1
# Step 3. Merge the count and boolean targets back onto the recipe table.
## merge review_counts and review_boolean into the recipe table
## NOTE(review): inner merges -- recipes with no reviews at all are absent from review_counts and so are dropped here; confirm intended
recipes_df_final_temp = recipes_df_rm_na.merge(review_counts, left_on="id",right_index=True)
recipes_df_final = recipes_df_final_temp.merge(review_boolean, left_on="id",right_index=True, suffixes=("_count", "_boolean"))
recipes_df_final.head()
## save processed, model-ready data
recipes_df_final.to_csv("model_ready.csv",index=True)
## for viz: count how many recipes have each review count, in each time period
review_counts_7_day = pd.DataFrame(review_counts.groupby("within 7-day")["within 7-day"].count())
review_counts_30_day = pd.DataFrame(review_counts.groupby("within 30-day")["within 30-day"].count())
review_counts_365_day = pd.DataFrame(review_counts.groupby("within 365-day")["within 365-day"].count())
## for viz: put review counts from the 3 time periods together (outer join keeps every review-count level)
review_counts_agg = review_counts_7_day.merge(review_counts_30_day, left_index= True, right_index= True, how="outer").merge(review_counts_365_day, left_index= True, right_index= True,how="outer")
review_counts_agg
## visualization: grouped bar chart of recipe counts per review-count bucket
import plotly.graph_objects as go
fig = go.Figure()
# one bar trace per review window (column, legend label, bar color)
for window_col, legend_label, bar_color in [
    ("within 7-day", "7-day", "indianred"),
    ("within 30-day", "30-day", "lightsalmon"),
    ("within 365-day", "365-day", "grey"),
]:
    fig.add_trace(go.Bar(
        x=review_counts_agg.index.values,
        y=review_counts_agg[window_col],
        name=legend_label,
        marker_color=bar_color,
    ))
fig.update_layout(
    title="Number of recipes by count of reviews reciped in 7-day, 30-day, 365-day period",
    xaxis_title="Count of Reviews",
    yaxis_title="Number of Recipes",
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="#7f7f7f"),
)
fig.show()
# Rotated x-axis tick labels could be enabled with:
# fig.update_layout(barmode='group', xaxis_tickangle=-45)
## turn dataframe into long format (one row per recipe/window pair, for plotting)
review_counts_agg_long = pd.melt(review_counts, value_vars=['within 7-day', 'within 30-day','within 365-day'])
review_counts_agg_long
import plotly.express as px
## box plot of per-recipe review counts for each time window
fig = px.box(review_counts_agg_long, x="variable", y="value",
notched=True, # used notched shape
title="Box plot of review counts",
)
fig.show()